import os
import sys
import numpy
from Bio import Align
from Bio.Align import substitution_matrices
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Command line: <target> <number>. Only the first two arguments are read;
# anything after them is ignored by the slice.
target, number = sys.argv[1:3]
number = int(number)

output_filename = "%s.%d.psl" % (target, number)

assembly = "hg38"


# score_threshold is multiplied by the query length further down to obtain
# the minimum alignment score a hit must reach. 'novel' uses a stricter
# fraction than the named filter sets; any other target name is rejected.
if target == 'novel':
    score_threshold = 0.9
elif target in (
    'chrM', 'rRNA', 'tRNA', 'snRNA', 'scRNA', 'histone', 'RPPH', 'snoRNA',
    'scaRNA', 'RMRP', 'yRNA', 'snar', 'vRNA', 'TERC', 'MALAT1', 'snhg',
    'mRNA',
):
    score_threshold = 0.8
else:
    raise Exception("Unknown target %s" % target)


def parse_targets(
    target,
    directory="/osc-fs_home/mdehoon/Data/CASPARs/Filters",
):
    """Load the filter sequences for *target* from a FASTA file.

    The file read is ``<directory>/<target>.fa``; its path is printed
    before reading.

    Args:
        target: Base name (without the ``.fa`` extension) of the FASTA file.
        directory: Directory that holds the FASTA file. Defaults to the
            location the script has always used.

    Returns:
        A dict mapping each record id to its sequence as an upper-case
        string. If an id occurs more than once, the last record wins.

    Raises:
        OSError: If the FASTA file cannot be opened.
    """
    path = os.path.join(directory, '%s.fa' % target)
    print("Reading", path)
    # The context manager closes the file even if parsing fails part-way.
    with open(path) as handle:
        return {
            record.id: str(record.seq).upper()
            for record in SeqIO.parse(handle, 'fasta')
        }

target_sequences = parse_targets(target)


# blat scoring parameters
# Gap handling: every gap position costs 1, except gaps at the ends of the
# query, which are free.
aligner = Align.PairwiseAligner()
aligner.gap_score = -1
aligner.query_end_gap_score = 0

# Substitution scores over A, C, G, T, N: +1 for identical letters, -1 for
# different letters. Any pairing that involves N is then reset to 0, which
# also overrides the +1 the diagonal fill gave to N/N.
substitution_matrix = substitution_matrices.Array("ACGTN", dims=2)
substitution_matrix[:, :] = -1
numpy.fill_diagonal(substitution_matrix, +1)
substitution_matrix['N', :] = substitution_matrix[:, 'N'] = 0
aligner.substitution_matrix = substitution_matrix


# Tolerance used when comparing alignment scores, so that scores which are
# equal up to rounding are treated as ties.
delta = 1.e-6


def _best_psl_lines(query_name, query_sequence):
    """Return the PSL lines of the best-scoring alignments for one query.

    The query is scored against every sequence in ``target_sequences`` with
    the module-level ``aligner``. A target is kept only if its score reaches
    ``score_threshold * len(query_sequence)``. As soon as a target scores
    strictly higher (by more than ``delta``) than the current best, all
    previously collected lines are discarded, so the result holds only the
    alignments whose score ties with the overall best.

    Args:
        query_name: Identifier written to the PSL output for the query.
        query_sequence: Query sequence as a string.

    Returns:
        A list of PSL-formatted lines, possibly empty, one per optimal
        alignment of each best-scoring target.
    """
    minimum_score = score_threshold * len(query_sequence)
    lines = []
    for target_name, target_sequence in target_sequences.items():
        # Cheap score-only pass first; alignments are only generated for
        # targets that can still contribute to the output.
        score = aligner.score(target_sequence, query_sequence)
        if score < minimum_score - delta:
            continue
        if score > minimum_score + delta:
            # Strictly better than everything seen so far: start over.
            lines = []
            minimum_score = score
        alignments = aligner.align(target_sequence, query_sequence)
        assert alignments.score == score
        target_record = SeqRecord(seq=target_sequence, id=target_name)
        query_record = SeqRecord(seq=query_sequence, id=query_name)
        # Iterate lazily; there is no need to hold every optimal alignment
        # in memory at once.
        for alignment in alignments:
            # Attach records so that the PSL writer emits the names.
            alignment.target = target_record
            alignment.query = query_record
            line = format(alignment, 'psl')
            words = line.split()
            assert len(words) == 21
            # PSL columns used to cross-check the alignment score.
            matches = int(words[0])
            misMatches = int(words[1])
            qBaseInsert = int(words[5])
            tBaseInsert = int(words[7])
            qSize = int(words[10])
            qStart = int(words[11])
            qEnd = int(words[12])
            # Sanity check: +1 per match, -1 per mismatch, -1 per inserted
            # base, and -1 per unaligned query base at either end must
            # reproduce the score reported by the aligner.
            assert matches - misMatches - qBaseInsert - tBaseInsert - qStart - (qSize - qEnd) == score
            lines.append(line)
    return lines


filename = "seqlist_%d.fa" % number
print("Reading", filename)
# Context managers make sure both files are closed (and the output flushed)
# even if a record fails validation or an internal check trips.
with open(filename) as handle:
    records = SeqIO.parse(handle, 'fasta')
    print("Writing", output_filename)
    with open(output_filename, 'w') as output:
        for record in records:
            query_name = record.id
            # Query names are expected to look like "seq_<integer>...".
            terms = query_name.split("_")
            if terms[0] != 'seq':
                raise ValueError("Unexpected query name %s" % query_name)
            # Parsed only to validate the name; stored under its own name so
            # the command-line `number` is not overwritten.
            query_number = int(terms[1])
            output.writelines(_best_psl_lines(query_name, str(record.seq)))
print("Done")
